<!DOCTYPE html>



  


<html class="theme-next gemini use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.8.0">
  <meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#222">









<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
















  
  
  <link href="/hcigmoid/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">




  
  
  
  

  
    
    
  

  
    
      
    

    
  

  

  

  

  
    
    
    <link href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic|Lato:300,300italic,400,400italic,700,700italic&subset=latin,latin-ext" rel="stylesheet" type="text/css">
  






<link href="/hcigmoid/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">

<link href="/hcigmoid/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">


  <link rel="apple-touch-icon" sizes="180x180" href="/hcigmoid/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/hcigmoid/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/hcigmoid/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/hcigmoid/images/logo.svg?v=5.1.4" color="#222">





  <meta name="keywords" content="推荐,协同过滤,算法,相似度,二部图,">





  <link rel="alternate" href="/hcigmoid/atom.xml" title="HCigmoid" type="application/atom+xml">






<meta name="description" content="一、应用背景最近需要对视频的相关推荐进行一些优化。之前尝试过 TagSim、AutoEncoder 和 Word2Vec 等方法，无非是基于元数据相似或基于协同相似的思路。但是在实际应用的时候，由于媒资传过来的信息未必是非常准确的，因此基于元数据相似的方法在数据基础上可能就存在一定的不确定性，因此常常会推出来一些虽然实际上很符合算法预期，但是看起来很奇怪的结果。而基于协同相似的推荐，由于需要比较多">
<meta name="keywords" content="推荐,协同过滤,算法,相似度,二部图">
<meta property="og:type" content="article">
<meta property="og:title" content="SimRank与视频相似度计算">
<meta property="og:url" content="http://guyuecanhui.gitee.io/hcigmoid/2019/04/29/simrank/index.html">
<meta property="og:site_name" content="HCigmoid">
<meta property="og:description" content="一、应用背景最近需要对视频的相关推荐进行一些优化。之前尝试过 TagSim、AutoEncoder 和 Word2Vec 等方法，无非是基于元数据相似或基于协同相似的思路。但是在实际应用的时候，由于媒资传过来的信息未必是非常准确的，因此基于元数据相似的方法在数据基础上可能就存在一定的不确定性，因此常常会推出来一些虽然实际上很符合算法预期，但是看起来很奇怪的结果。而基于协同相似的推荐，由于需要比较多">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="http://guyuecanhui.gitee.io/hcigmoid/2019/04/29/simrank/simrank-f1.png">
<meta property="og:image" content="http://guyuecanhui.gitee.io/hcigmoid/2019/04/29/simrank/simrank-f2.png">
<meta property="og:updated_time" content="2020-06-10T13:23:40.758Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="SimRank与视频相似度计算">
<meta name="twitter:description" content="一、应用背景最近需要对视频的相关推荐进行一些优化。之前尝试过 TagSim、AutoEncoder 和 Word2Vec 等方法，无非是基于元数据相似或基于协同相似的思路。但是在实际应用的时候，由于媒资传过来的信息未必是非常准确的，因此基于元数据相似的方法在数据基础上可能就存在一定的不确定性，因此常常会推出来一些虽然实际上很符合算法预期，但是看起来很奇怪的结果。而基于协同相似的推荐，由于需要比较多">
<meta name="twitter:image" content="http://guyuecanhui.gitee.io/hcigmoid/2019/04/29/simrank/simrank-f1.png">



<script type="text/javascript" id="hexo.configurations">
  // Theme namespace object: reuse window.NexT when it already exists, otherwise start empty.
  var NexT = window.NexT || {};

  // Page-level settings object (presumably read by the theme scripts loaded later — confirm against the theme source).
  var CONFIG = {
    root: '/hcigmoid/',
    scheme: 'Gemini',
    version: '5.1.4',
    sidebar: {
      position: 'left',
      display: 'post',
      offset: 12,
      b2t: false,
      scrollpercent: false,
      onmobile: false
    },
    fancybox: true,
    tabs: true,
    motion: {
      enable: true,
      async: false,
      transition: {
        post_block: 'fadeIn',
        post_header: 'slideDownIn',
        post_body: 'slideDownIn',
        coll_header: 'slideLeftIn',
        sidebar: 'slideUpIn'
      }
    },
    duoshuo: {
      userId: '0',
      author: '博主'
    },
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {
        per_page: 10
      },
      // The ${...} placeholders are plain text inside ordinary strings, not template literals.
      labels: {
        input_placeholder: "Search for Posts",
        hits_empty: "We didn't find any results for the search: ${query}",
        hits_stats: "${hits} results found in ${time} ms"
      }
    }
  };
</script>



  <link rel="canonical" href="http://guyuecanhui.gitee.io/hcigmoid/2019/04/29/simrank/">





  <title>SimRank与视频相似度计算 | HCigmoid</title>
  








</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/hcigmoid/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">HCigmoid</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <p class="site-subtitle">Watch, learn and practise</p>
      
  </div>

  <div class="site-nav-toggle">
    <button type="button" aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      
        
        <li class="menu-item menu-item-home">
          <a href="/hcigmoid/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-home"></i> <br>
            
            首页
          </a>
        </li>
      
        
        <li class="menu-item menu-item-about">
          <a href="/hcigmoid/about/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-user"></i> <br>
            
            关于
          </a>
        </li>
      
        
        <li class="menu-item menu-item-tags">
          <a href="/hcigmoid/tags/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-tags"></i> <br>
            
            标签
          </a>
        </li>
      
        
        <li class="menu-item menu-item-categories">
          <a href="/hcigmoid/categories/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-th"></i> <br>
            
            分类
          </a>
        </li>
      
        
        <li class="menu-item menu-item-archives">
          <a href="/hcigmoid/archives/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
            
            归档
          </a>
        </li>
      

      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br>
            
            搜索
          </a>
        </li>
      
    </ul>
  

  
    <div class="site-search">
      
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input" aria-label="搜索">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://guyuecanhui.gitee.io/hcigmoid/2019/04/29/simrank/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="古月残辉">
      <meta itemprop="description" content>
      <meta itemprop="image" content="/hcigmoid/images/avatar.gif">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="HCigmoid">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">SimRank与视频相似度计算</h1>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2019-04-29T09:09:50+08:00">
                2019-04-29
              </time>
            

            

            
          </span>

          
            <span class="post-category">
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/hcigmoid/categories/推荐系统/" itemprop="url" rel="index">
                    <span itemprop="name">推荐系统</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
              <span class="post-comments-count">
                <span class="post-meta-divider">|</span>
                <span class="post-meta-item-icon">
                  <i class="fa fa-comment-o"></i>
                </span>
                <a href="/hcigmoid/2019/04/29/simrank/#comments" itemprop="discussionUrl">
                  <span class="post-comments-count valine-comment-count" data-xid="/hcigmoid/2019/04/29/simrank/" itemprop="commentCount"></span>
                </a>
              </span>
            
          

          
          

          
            <span class="post-meta-divider">|</span>
            <span class="page-pv"><i class="fa fa-file-o"></i> 阅读数
            <span class="busuanzi-value" id="busuanzi_value_page_pv"></span>
            </span>
          

          
            <div class="post-wordcount">
              
                
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                  <span class="post-meta-item-text">字数统计&#58;</span>
                
                <span title="字数统计">
                  2.8k
                </span>
              

              
                <span class="post-meta-divider">|</span>
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-clock-o"></i>
                </span>
                
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                
                <span title="阅读时长">
                  10
                </span>
              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <h2 id="一、应用背景"><a href="#一、应用背景" class="headerlink" title="一、应用背景"></a>一、应用背景</h2><p>最近需要对视频的相关推荐进行一些优化。之前尝试过 TagSim、AutoEncoder 和 Word2Vec 等方法，无非是基于元数据相似或基于协同相似的思路。但是在实际应用的时候，由于媒资传过来的信息未必是非常准确的，因此基于元数据相似的方法在数据基础上可能就存在一定的不确定性，因此常常会推出来一些虽然实际上很符合算法预期，但是看起来很奇怪的结果。而基于协同相似的推荐，由于需要比较多的行为数据来估计视频之间的相似度，又往往只能覆盖少量的视频。在应用中，我们往往使用的是两者的混合，但是由于混合比较简单粗暴，仍然有很多 VOC 问题。</p>
<p>因此，团队迫切的需要一种能够提升相关推荐效果的模型。而这种相关又是有强业务语义的，需要能够支持灵活的定制，因此在短时间内先不考虑深度网络（可解释性太差）。在调研中，发现有基于热传导的算法，感觉好像挺符合直观感觉，用了协同数据，同时也支持元数据。但是再顺着这个思路往下找的时候，发现  <strong>SimRank</strong> 是一种十分成熟且常用于相关推荐的模型，粗看了一下，感觉很符合我们的业务诉求，就迫不及待尝试了一下。</p>
<a id="more"></a>
<h2 id="二、SimRank-基本模型"><a href="#二、SimRank-基本模型" class="headerlink" title="二、SimRank 基本模型"></a>二、SimRank 基本模型</h2><h3 id="2-1-核心思想"><a href="#2-1-核心思想" class="headerlink" title="2.1 核心思想"></a>2.1 核心思想</h3><p>由于 <strong>SimRank</strong> 提出的时间比较早，网上的材料很多，而且大多长的也差不多，可以参考文献 [1, 2] ，这里只简单的搬个砖。</p>
<p>文献 [1] 最早提出  <strong>SimRank</strong> 模型，核心的思想是 “<strong>two objects are similar if they are related to similar objects</strong>”（这跟 PageRank 的思路完全一致，只是 PageRank 是用来评估每个链接的重要性，而  <strong>SimRank</strong> 是用来评估每两个物品间的相似度）。 <strong>SimRank</strong> 既支持计算所有节点对之间的相似度（如输入数据为文章引用记录），也支持计算二部图中每一部分节点间的相似度（如输入数据为用户行为记录）。由于我们是做视频推荐，主要用的是用户行为数据，因此这里只介绍基于二部图的模型。</p>
<p>举个简单的例子：如下图所示，用户 $u_1$ 观看了视频 $v_1,v_2,v_3$；用户 $u_2$ 观看了视频 $v_2,v_3,v_4$，则可以用二部图来表示这种观影关系（二部图是因为用户 $u_1,u_2$ 之间无联系，且视频 $v_1,v_2,v_3,v_4$ 间无联系，只有用户-视频间存在有向边）：</p>
<img src="/hcigmoid/2019/04/29/simrank/simrank-f1.png" alt="用户观影二部图示例" title="用户观影二部图示例">
<p>为了评估视频 $v_1,v_4$ 之间的相似度，需要看看哪些人看了 $v_1,v_4$ ，以及这些用户的相似度。这是一个典型的递归逻辑，递归的起点在于：每个节点（包括这里的用户/视频）与自己的相似度为 1；没有关联的节点间相似度为 0（一种情况是这两个节点没有与其他节点的联系，还有一种情况是在迭代的初始状态时，所有节点对间的相似度为 0）。值得注意的是，如果用 ItemCF 算法来计算 $v_1,v_4$ 的相似度，由于它们没有共同观看的用户，相似度为 0，具体对比可以参考我之前的博客：<a href="https://guyuecanhui.github.io/2019/04/12/itemcf/" target="_blank" rel="noopener">可能是最好懂的ItemCF解释了</a>。</p>
<h3 id="2-2-基于二部图的描述"><a href="#2-2-基于二部图的描述" class="headerlink" title="2.2 基于二部图的描述"></a>2.2 基于二部图的描述</h3><p>最直观和容易理解的是基于图的描述。用数学语言来表达上面的思路：</p>
<script type="math/tex; mode=display">
\begin{cases}
s(u,u')=\frac{c_1}{|O(u)|\cdot|O(u')|}\sum_{i\in O(u)}\sum_{j\in O(u')}s(i, j) \\
s(v,v')=\frac{c_2}{|I(v)|\cdot|I(v')|}\sum_{i\in I(v)}\sum_{j\in I(v')}s(i, j)
\end{cases} \quad (1)</script><p>其中，$u$ 表示用户，$v$ 表示视频，$O(u)$ 表示用户 $u$ 观看过的视频集合，$I(v)$ 表示视频的观看用户集合，$s(i,j)$ 表示两个节点的相似度，$c_i$ 为常数系数。式 (1) 中累加相似度的部分不是很好理解，实际上就是对两个节点所有关联的节点进行两两组合计算相似度之和。$c_1, c_2$ 可以理解成相似度的传导率，传导率越大，受到相邻节点影响也就越大，每轮迭代相似度的传播也就越快，表现为迭代若干轮后，节点间的相似度越高（文献 [1] 中建议的是0.8）。如果使用随机游走的方法，则传导率越大，下一个状态转移到相邻节点的概率越大，即下一个状态保持原来节点概率越小。</p>
<p>在实现模型的时候，可以直接在图上按公式  (1)  进行计算，但是需要注意缓存中间结果$^{[3]}$，否则存在很多重复计算，实测中，不做什么优化的话，超过 $10000\times10000$ 的二部图单机基本就几个小时都算不出来了。</p>
<h3 id="2-3-基于矩阵的描述"><a href="#2-3-基于矩阵的描述" class="headerlink" title="2.3 基于矩阵的描述"></a>2.3 基于矩阵的描述</h3><p>另外一种等价的描述是将图转化成矩阵，比如原来的二部图是 $G = (U, V, E)$，即共 $n_u + n_v$ 个节点，可以转化成 $(n_u + n_v) \times (n_u + n_v)$ 的状态转移矩阵 $W$。根据公式 (1) 的描述，图中的每一条边对应于转移矩阵的一个元素（这里实现的时候用户和视频一般是分开连续编号的），从而可以设置转移矩阵为： </p>
<script type="math/tex; mode=display">
\begin{cases}
w(u,v)=\frac{1}{|O(u)|} \\
w(v,u)=\frac{1}{|I(v)|}
\end{cases}</script><p>转移矩阵中其他元素为 0。而根据定义，相似度矩阵 $S$ 中对角线始终为 1，其他元素初始化为 0。则基于矩阵的迭代过程可以用下式来表达：</p>
<script type="math/tex; mode=display">
S = C\cdot W^T\cdot S\cdot W + (I-diag(C\cdot W^T\cdot S\cdot W)) \quad (2)</script><p>其中，矩阵 $C$ 的对角线元素为 <script type="math/tex">c_1</script> 或 <script type="math/tex">c_2</script>，如果 <script type="math/tex">c_1=c_2=c</script>，那 $C$ 可以直接用系数 $c$ 来代替。公式 (2) 的前一部分就是公式 (1) 的矩阵描述，后一部分实际上是为了设置每轮迭代时，相似矩阵的对角线为 1，即 <script type="math/tex">s_{i,i}=1</script>。</p>
<p>注意到，在二部图的情况下，<code>用户-视频</code>的相似度必然是 0，同时，<code>用户-用户</code> / <code>视频-视频</code>的转移矩阵也必然是 0。因此相似矩阵和转移矩阵可以简单的拆成<code>用户-用户</code>相似矩阵 <script type="math/tex">S_u</script>、<code>视频-视频</code>相似矩阵 <script type="math/tex">S_v</script> 以及<code>用户-视频</code>转移矩阵 <script type="math/tex">W_{uv}</script>、<code>视频-用户</code>转移矩阵 $W_{vu}$，并做分块乘法。简单的推导一下：</p>
<script type="math/tex; mode=display">
\begin{equation}\begin{aligned}
W^T\cdot S\cdot W &=
\left[
\begin{array}{cc}
0 & W_{vu}^T \\
W_{uv}^T & 0
\end{array}
\right] \cdot
\left[
\begin{array}{cc}
S_u & 0 \\
0 & S_v
\end{array}
\right] \cdot
\left[
\begin{array}{cc}
0 & W_{uv} \\
W_{vu} & 0
\end{array}
\right] \\
&=\left[
\begin{array}{cc}
W_{vu}^T\cdot S_v \cdot W_{vu} & 0 \\
0 & W_{uv}^T\cdot S_u \cdot W_{uv}
\end{array}
\right]
\end{aligned}\end{equation}</script><p>仔细品味一下这个公式，能更直观的了解相似度的传递过程。因此，迭代计算公式为：</p>
<script type="math/tex; mode=display">
\begin{cases}
S_u^{k+1} = c_1 \cdot W_{vu}^T\cdot S_v^k \cdot W_{vu} + (I - diag(c_1 \cdot W_{vu}^T\cdot S_v^k \cdot W_{vu})) \\
S_v^{k+1} = c_2 \cdot W_{uv}^T\cdot S_u^k \cdot W_{uv} + (I-diag(c_2\cdot W_{uv}^T\cdot S_u^k \cdot W_{uv}))
\end{cases}\quad (3)</script><h3 id="2-4-扩展用户-视频属性"><a href="#2-4-扩展用户-视频属性" class="headerlink" title="2.4 扩展用户/视频属性"></a>2.4 扩展用户/视频属性</h3><p>以上描述了经典的基于二部图的  <strong>SimRank</strong> 算法，但是其实我们可以将视频的元数据/用户的属性数据作为辅助节点加入到图中来，并添加<code>视频元数据</code>$\rightarrow$<code>视频</code>和<code>用户画像</code>$\rightarrow$<code>用户</code>的单向边（表示用户/视频的相似度不会反向传播给画像/元数据），同时初始化不同维度的视频元数据/用户画像的相似度，以达到运营干预的目的。具体的分块乘法就不推导了，跟 2.3 节差不多，这里只举一个例子：</p>
<img src="/hcigmoid/2019/04/29/simrank/simrank-f2.png" alt="扩展用户/视频属性后的二部图示例" title="扩展用户/视频属性后的二部图示例">
<p>上图中，本来 $u_1,u_2$ 之间是没有边相连的，因此相似度为 0，但是由于他们同属男性，因此由<code>男性</code>这个画像向这两个用户传播了一定的相似度；同样的，本来 $v_1,v_2$ 之间的相似度也为 0，但是由于它们都是搞笑的视频，因此<code>搞笑</code>这个元数据也向它们传播了一定的相似度。<strong>加入用户维度和视频维度的辅助节点以后，有助于解决由于行为较少而无法准确评估相似度的情况。</strong></p>
<h2 id="三、模型实现与优化讨论"><a href="#三、模型实现与优化讨论" class="headerlink" title="三、模型实现与优化讨论"></a>三、模型实现与优化讨论</h2><p>我在 spark 2 和在 python 中分别实现了上述过程，使用图遍历的方式优点是代码简单，但是对于大规模的图优化比较麻烦，速度很慢；使用矩阵计算的时候，主要的问题又在于矩阵的优化计算。下面简单讲下一些可行的优化思路。</p>
<h5 id="a-基于-spark-的精确计算"><a href="#a-基于-spark-的精确计算" class="headerlink" title="a. 基于 spark 的精确计算"></a>a. 基于 spark 的精确计算</h5><p>如果使用 mllib 的 <code>BlockMatrix</code> 来计算，会强制将稀疏矩阵转成稠密矩阵来计算，因此开销比实际需要的大很多，因此一定要使用公式 (3) 来替代公式 (2)。但是即使这样，也不能从根本上解决问题，根本上是需要自己实现一套高效的分布式稀疏矩阵的计算方法，网上有一些开源项目可参考。</p>
<h5 id="b-基于-python-的精确计算"><a href="#b-基于-python-的精确计算" class="headerlink" title="b. 基于 python 的精确计算"></a>b. 基于 python 的精确计算</h5><p>使用 python 进行计算时，由于相似度的精度要求不高，因此使用 <code>np.float16</code> 就足够了，并且每轮迭代完，要将小于一定阈值的相似项置 0（如果只需要计算 topN 相似的话，每轮可以只保留系数最大的 topN 项）。另外，构建矩阵用 <code>csr_matrix</code> 比较方便，计算的时候还是得用 <code>lil_matrix</code>。</p>
<h5 id="c-迭代近似"><a href="#c-迭代近似" class="headerlink" title="c. 迭代近似"></a>c. 迭代近似</h5><p>由于我们只需要算视频的相似度，有一种解决上面问题的思路是将用户随机分成若干份，用这些用户的数据来计算视频的相似矩阵，然后将这些相似矩阵加起来求平均，但是效果不是很好。</p>
<h5 id="d-矩阵分析"><a href="#d-矩阵分析" class="headerlink" title="d. 矩阵分析"></a>d. 矩阵分析</h5><p>针对矩阵运算，可以预先分析矩阵的特点，然后再采用一定的手段来减少总计算量。这里涉及一些矩阵分解的优化方法，以后有机会再仔细研究研究。</p>
<h2 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h2><p>[1] Jeh, G., &amp; Widom, J. (2002, July). SimRank: a measure of structural-context similarity. In <em>Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining</em> (pp. 538-543). ACM.</p>
<p>[2] SimRank协同过滤推荐算法: <a href="http://www.cnblogs.com/pinard/p/6362647.html" target="_blank" rel="noopener">http://www.cnblogs.com/pinard/p/6362647.html</a>.</p>
<p>[3] Lizorkin, D., Velikhov, P., Grinev, M., &amp; Turdakov, D. (2008). Accuracy estimate and optimization techniques for simrank computation. <em>Proceedings of the VLDB Endowment</em>, <em>1</em>(1), 422-433.</p>

      
    </div>
    
    
    

    

    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/hcigmoid/tags/推荐/" rel="tag"># 推荐</a>
          
            <a href="/hcigmoid/tags/协同过滤/" rel="tag"># 协同过滤</a>
          
            <a href="/hcigmoid/tags/算法/" rel="tag"># 算法</a>
          
            <a href="/hcigmoid/tags/相似度/" rel="tag"># 相似度</a>
          
            <a href="/hcigmoid/tags/二部图/" rel="tag"># 二部图</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/hcigmoid/2019/04/14/hexo-next-github/" rel="next" title="Hexo+NexT+github 配置指南">
                <i class="fa fa-chevron-left"></i> Hexo+NexT+github 配置指南
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/hcigmoid/2019/05/10/simrankpp/" rel="prev" title="从 SimRank 到 SimRank++">
                从 SimRank 到 SimRank++ <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  
    <div class="comments" id="comments">
    </div>
  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <p class="site-author-name" itemprop="name">古月残辉</p>
              <p class="site-description motion-element" itemprop="description">总结心得</p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/hcigmoid/archives/">
              
                  <span class="site-state-item-count">32</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-categories">
                <a href="/hcigmoid/categories/index.html">
                  <span class="site-state-item-count">6</span>
                  <span class="site-state-item-name">分类</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-tags">
                <a href="/hcigmoid/tags/index.html">
                  <span class="site-state-item-count">78</span>
                  <span class="site-state-item-name">标签</span>
                </a>
              </div>
            

          </nav>

          
            <div class="feed-link motion-element">
              <a href="/hcigmoid/atom.xml" rel="alternate">
                <i class="fa fa-rss"></i>
                RSS
              </a>
            </div>
          

          
            <div class="links-of-author motion-element">
                
                  <span class="links-of-author-item">
                    <a href="mailto:guyuecanhui@icloud.com" target="_blank" title="E-Mail">
                      
                        <i class="fa fa-fw fa-envelope"></i>E-Mail</a>
                  </span>
                
            </div>
          

          
          

          
          

          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#一、应用背景"><span class="nav-number">1.</span> <span class="nav-text">一、应用背景</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#二、SimRank-基本模型"><span class="nav-number">2.</span> <span class="nav-text">二、SimRank 基本模型</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#2-1-核心思想"><span class="nav-number">2.1.</span> <span class="nav-text">2.1 核心思想</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#2-2-基于二部图的描述"><span class="nav-number">2.2.</span> <span class="nav-text">2.2 基于二部图的描述</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#2-3-基于矩阵的描述"><span class="nav-number">2.3.</span> <span class="nav-text">2.3 基于矩阵的描述</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#2-4-扩展用户-视频属性"><span class="nav-number">2.4.</span> <span class="nav-text">2.4 扩展用户/视频属性</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#三、模型实现与优化讨论"><span class="nav-number">3.</span> <span class="nav-text">三、模型实现与优化讨论</span></a><ol class="nav-child"><li class="nav-item nav-level-5"><a class="nav-link" href="#a-基于-spark-的精确计算"><span class="nav-number">3.0.0.1.</span> <span class="nav-text">a. 基于 spark 的精确计算</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#b-基于-python-的精确计算"><span class="nav-number">3.0.0.2.</span> <span class="nav-text">b. 基于 python 的精确计算</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#c-迭代近似"><span class="nav-number">3.0.0.3.</span> <span class="nav-text">c. 迭代近似</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#d-矩阵分析"><span class="nav-number">3.0.0.4.</span> <span class="nav-text">d. 矩阵分析</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#参考文献"><span class="nav-number">4.</span> <span class="nav-text">参考文献</span></a></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; 2018 &mdash; <span itemprop="copyrightYear">2020</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">古月残辉</span>

  
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-area-chart"></i>
    </span>
    
      <span class="post-meta-item-text">Site words total count&#58;</span>
    
    <span title="Site words total count">58.2k</span>
  
</div>









<script async src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

        
<div class="busuanzi-count">
  <!-- The busuanzi counter script is already loaded once on this page; the duplicate include from the retired qbox.me host was dropped. -->

  
    <span class="site-uv">
      <i class="fa fa-user"></i> 访问人数
      <span class="busuanzi-value" id="busuanzi_value_site_uv"></span>
      人
    </span>
  

  
    <span class="site-pv">
      <i class="fa fa-eye"></i> 总访问量
      <span class="busuanzi-value" id="busuanzi_value_site_pv"></span>
      次
    </span>
  
</div>








        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

  </div>

  

<script type="text/javascript">
  // Null out window.Promise unless its internal class tag is exactly "[object Function]".
  // NOTE(review): presumably this lets later scripts fall back to their own Promise handling — confirm.
  (function () {
    var promiseTag = Object.prototype.toString.call(window.Promise);
    if (promiseTag === '[object Function]') {
      return;
    }
    window.Promise = null;
  })();
</script>









  












  
  
    <script type="text/javascript" src="/hcigmoid/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  


  


  <script type="text/javascript" src="/hcigmoid/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/hcigmoid/js/src/motion.js?v=5.1.4"></script>



  
  


  <script type="text/javascript" src="/hcigmoid/js/src/affix.js?v=5.1.4"></script>

  <script type="text/javascript" src="/hcigmoid/js/src/schemes/pisces.js?v=5.1.4"></script>



  
  <script type="text/javascript" src="/hcigmoid/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/hcigmoid/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/hcigmoid/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  










  <script src="//cdn1.lncld.net/static/js/3.0.4/av-min.js"></script>
  <script src="//unpkg.com/valine/dist/Valine.min.js"></script>
  
  <script type="text/javascript">
    // Recognised guest-info field names.
    var GUEST = ['nick', 'mail', 'link'];

    // Configured field list, keeping only the names that appear in GUEST.
    var guest = 'nick,mail,link'.split(',').filter(function (field) {
      return GUEST.indexOf(field) !== -1;
    });

    // Mount the Valine comment widget on the #comments container.
    new Valine({
      el: '#comments',
      verify: false,
      notify: false,
      appId: '6du4Ppc2TvUuhcccRHSDNH2v-gzGzoHsz',
      appKey: 'zOKNml4W1Bq3OTzEuLt5hUjI',
      placeholder: '感谢阅读！欢迎评论！',
      avatar: 'mm',
      guest_info: guest,
      // Evaluates to the string '10'; the numeric fallback is never reached.
      pageSize: '10' || 10
    });
  </script>



  

  <script type="text/javascript">
    // Local-search state shared by the search popup code below.
    // isfetched starts false and is flipped once the search data has been loaded.
    var isfetched = false;

    // Search database file, relative to the site root; an empty value falls back to search.xml.
    var search_path = "search.xml";
    if (search_path.length === 0) {
      search_path = "search.xml";
    }

    // The database is treated as XML unless the file name ends in "json" (case-insensitive).
    var isXml = !/json$/i.test(search_path);

    // Full URL path of the search database.
    var path = "/hcigmoid/" + search_path;
    // monitor main search box;

    // Close the search popup: hide it, clear the query box, drop rendered results
    // and the overlay, and reset the body's inline overflow style.
    var onPopupClose = function (event) {
      $('.popup').hide();
      $('#local-search-input').val('');
      $('.search-result-list, #no-result, .local-search-pop-overlay').remove();
      $('body').css('overflow', '');
    };

    // Open the search popup: add a click-to-close overlay, set the body's overflow to hidden,
    // toggle the popup and move focus to the search input.
    function proceedsearch() {
      var $body = $("body");
      $body.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
      $body.css('overflow', 'hidden');

      // Clicking any overlay closes the popup.
      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();

      // Set autocapitalize/autocorrect on the query field before focusing it.
      $('#local-search-input')
        .attr("autocapitalize", "none")
        .attr("autocorrect", "off")
        .focus();
    }

    // search function;
    /**
     * Download the search index and wire up the local search box.
     *
     * Shows a loading overlay, requests `path` as XML or JSON (selected by the
     * outer `isXml` flag), binds the search handler to the input element and
     * then opens the popup through `proceedsearch()`. When the request fails
     * the overlay is removed again and `isfetched` stays false, so the next
     * click on the trigger retries the download.
     *
     * @param {string} path        URL of the search index.
     * @param {string} search_id   id of the search input element.
     * @param {string} content_id  id of the element that receives the results.
     */
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Take the loading overlay away and let the page scroll again.
      function removeLoadingOverlay() {
        $(".local-search-pop-overlay").remove();
        $('body').css('overflow', '');
      }

      // Escape plain text for use as HTML content or inside a quoted attribute.
      function escapeHtml(text) {
        return text
          .replace(/&/g, '&amp;')
          .replace(/</g, '&lt;')
          .replace(/>/g, '&gt;')
          .replace(/"/g, '&quot;')
          .replace(/'/g, '&#39;');
      }

      // Pass-through used where a fragment must be inserted unchanged.
      function identity(text) {
        return text;
      }

      // Collect every non-overlapping occurrence of `word` in `text` as
      // {position, word} records; an empty word never matches.
      function getIndexByWord(word, text, caseSensitive) {
        var wordLen = word.length;
        if (wordLen === 0) {
          return [];
        }
        var startPosition = 0, position, index = [];
        if (!caseSensitive) {
          text = text.toLowerCase();
          word = word.toLowerCase();
        }
        while ((position = text.indexOf(word, startPosition)) > -1) {
          index.push({position: position, word: word});
          startPosition = position + wordLen;
        }
        return index;
      }

      // Render text[slice.start, slice.end) with each hit wrapped in
      // <b class="search-keyword">. `transform` is applied to every fragment
      // cut from `text` before it is concatenated into the markup.
      function highlightKeyword(text, slice, transform) {
        var result = '';
        var prevEnd = slice.start;
        slice.hits.forEach(function (hit) {
          result += transform(text.substring(prevEnd, hit.position));
          var end = hit.position + hit.length;
          result += '<b class="search-keyword">' + transform(text.substring(hit.position, end)) + '</b>';
          prevEnd = end;
        });
        result += transform(text.substring(prevEnd, slice.end));
        return result;
      }

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Re-run the search against the current input value and redraw the list.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              // also look for the whole phrase, not only its separate words
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var hitCount = 0;
                var searchTextCount = 0;
                var title = data.title.trim();
                var titleInLowerCase = title.toLowerCase();
                var content = data.content.trim().replace(/<[^>]+>/g,"");
                var contentInLowerCase = content.toLowerCase();
                var articleUrl = decodeURIComponent(data.url);
                var indexOfTitle = [];
                var indexOfContent = [];

                // Consume hits from the end of `index` (sorted so the lowest
                // position is last) while they fit before `end`, dropping hits
                // that overlap the one just taken. Also adds the number of
                // whole-phrase hits to this article's searchTextCount.
                function mergeIntoSlice(text, start, end, index) {
                  var item = index[index.length - 1];
                  var position = item.position;
                  var word = item.word;
                  var hits = [];
                  var searchTextCountInSlice = 0;
                  while (position + word.length <= end && index.length != 0) {
                    if (word === searchText) {
                      searchTextCountInSlice++;
                    }
                    hits.push({position: position, length: word.length});
                    var wordEnd = position + word.length;

                    // move to next position of hit
                    index.pop();
                    while (index.length != 0) {
                      item = index[index.length - 1];
                      position = item.position;
                      word = item.word;
                      if (wordEnd > position) {
                        index.pop();
                      } else {
                        break;
                      }
                    }
                  }
                  searchTextCount += searchTextCountInSlice;
                  return {
                    hits: hits,
                    start: start,
                    end: end,
                    searchTextCount: searchTextCountInSlice
                  };
                }

                // only match articles with not empty titles
                if (title != '') {
                  keywords.forEach(function(keyword) {
                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                }
                hitCount = indexOfTitle.length + indexOfContent.length;
                if (hitCount === 0) {
                  // nothing matched in this article
                  return;
                }

                // sort index by position of keyword: descending position, so
                // that pop() yields hits from the start of the text first
                [indexOfTitle, indexOfContent].forEach(function (index) {
                  index.sort(function (itemLeft, itemRight) {
                    if (itemRight.position !== itemLeft.position) {
                      return itemRight.position - itemLeft.position;
                    } else {
                      return itemLeft.word.length - itemRight.word.length;
                    }
                  });
                });

                // merge hits into slices
                var slicesOfTitle = [];
                if (indexOfTitle.length != 0) {
                  slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
                }

                var slicesOfContent = [];
                while (indexOfContent.length != 0) {
                  var item = indexOfContent[indexOfContent.length - 1];
                  var position = item.position;
                  var word = item.word;
                  // cut out 100 characters
                  var start = position - 20;
                  var end = position + 80;
                  if (start < 0) {
                    start = 0;
                  }
                  if (end < position + word.length) {
                    end = position + word.length;
                  }
                  if (end > content.length) {
                    end = content.length;
                  }
                  slicesOfContent.push(mergeIntoSlice(content, start, end, indexOfContent));
                }

                // sort slices in content by search text's count and hits' count
                slicesOfContent.sort(function (sliceLeft, sliceRight) {
                  if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                    return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                  } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                    return sliceRight.hits.length - sliceLeft.hits.length;
                  } else {
                    return sliceLeft.start - sliceRight.start;
                  }
                });

                // select top N slices in content (a negative N keeps them all)
                var upperBound = parseInt('1', 10);
                if (upperBound >= 0) {
                  slicesOfContent = slicesOfContent.slice(0, upperBound);
                }

                // highlight title and content
                // The title and the decoded URL are escaped before they are
                // concatenated into markup, so characters such as < or ' in
                // them cannot break the generated HTML.
                var href = escapeHtml(articleUrl);
                var resultItem = "<li><a href='" + href + "' class='search-result-title'>";
                if (slicesOfTitle.length != 0) {
                  resultItem += highlightKeyword(title, slicesOfTitle[0], escapeHtml);
                } else {
                  resultItem += escapeHtml(title);
                }
                resultItem += "</a>";

                // NOTE(review): content is inserted unescaped on purpose. It is
                // markup with its tags stripped and presumably may still carry
                // HTML entities that must stay decodable - confirm against the
                // index generator.
                slicesOfContent.forEach(function (slice) {
                  resultItem += "<a href='" + href + "'>" +
                    "<p class=\"search-result\">" + highlightKeyword(content, slice, identity) +
                    "...</p>" + "</a>";
                });

                resultItem += "</li>";
                resultItems.push({
                  item: resultItem,
                  searchTextCount: searchTextCount,
                  hitCount: hitCount,
                  id: resultItems.length
                });
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              // empty query: show the search placeholder icon
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x" /></div>';
            } else if (resultItems.length === 0) {
              // query without any match
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x" /></div>';
            } else {
              // most whole-phrase hits first, then most hits, then latest index entry
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          if ('auto' === 'auto') {
            // search on every change of the input value
            input.addEventListener('input', inputEventFunction);
          } else {
            // search only on the icon click or the Enter key (key code 13)
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          removeLoadingOverlay();

          proceedsearch();
        },
        error: function() {
          // The index could not be loaded: without this handler the overlay
          // would stay on screen and the body would remain unscrollable.
          // isfetched is left untouched so the next trigger click retries.
          removeLoadingOverlay();
        }
      });
    };

    // Open the search popup; the index is requested only while it has not
    // been fetched yet, afterwards the popup is shown directly.
    $('.popup-trigger').on('click', function(evt) {
      evt.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    // The close button dismisses the popup.
    $('.popup-btn-close').on('click', onPopupClose);

    // Clicks inside the popup are not propagated to ancestor elements.
    $('.popup').on('click', function(evt) {
      evt.stopPropagation();
    });

    // Releasing Escape (key code 27) dismisses the popup while it is visible.
    $(document).on('keyup', function (evt) {
      var ESCAPE_KEY = 27;
      if (evt.which === ESCAPE_KEY && $('.search-popup').is(':visible')) {
        onPopupClose();
      }
    });
  </script>





  

  

  

  
  

  
  
    <script type="text/x-mathjax-config">
      // Options for MathJax's tex2jax preprocessor on this page.
      MathJax.Hub.Config({
        tex2jax: {
          // Tags listed here are excluded from the search for math delimiters.
          skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code'],
          // Enable backslash-escape processing in the preprocessor.
          processEscapes: true,
          // Inline math is delimited either by $...$ or by \(...\).
          inlineMath: [
            ['$', '$'],
            ["\\(", "\\)"]
          ]
        }
      });
    </script>

    <script type="text/x-mathjax-config">
      // Queued on the MathJax hub: append the "has-jax" class to the parent
      // node of the source element of every jax that getAllJax() reports.
      MathJax.Hub.Queue(function() {
        MathJax.Hub.getAllJax().forEach(function(jax) {
          var host = jax.SourceElement().parentNode;
          host.className = host.className + ' has-jax';
        });
      });
    </script>
    <!-- MathJax 2.7.1 loaded over explicit HTTPS from cdnjs, with the version pinned in the path. Must stay after the text/x-mathjax-config blocks above. -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
  


  

  

</body>
</html>
